#########################################################################   
#               INFO                                                    #   
#########################################################################  

  # PROJECT: Local Governance in Tunisia
  # PURPOSE: Create (new) municipal-level variables for election data 
  # CREATED: May 2020 by Julia Clark
  # INPUTS:  mun_new_list_all.xlsx, parties_master.xlsx, 
  #          population_sector_2014.xlsx, mun_id.xlsx, admin_master_ID.xlsx
  # OUTPUTS: elections_mun_variables_long.xlsx, elections_mun_variables_wide.xlsx, 
  #          elections_mun_list_variables.xlsx 

#########################################################################   
#               SETUP                                                   #   
#########################################################################

  ######## ENVIRONMENT

  rm(list = ls()) 

  ######## SET WORKING DIRECTORY TO LOCATION WITH DATA FILES
  setwd("~/Desktop/replication_what_men_want/elections_rep/")

  ######## PACKAGES

  need <- c("openxlsx", "dplyr", "tidyr")
  have <- need %in% rownames(installed.packages()) # packages you have
  if(any(!have)) install.packages(need[!have]) # install missing packages
  invisible(lapply(need, library, character.only=T)) # load needed packages

  ######## SET ALL TABLES TO SHOW "NA" BY DEFAULT

  table = function (..., useNA = 'ifany') base::table(..., useNA = useNA)
  
  ######## FUNCTIONS
  
  # Identify columns where all values are NA
  all_na <- function(x) any(!is.na(x))

#########################################################################   
#               LOAD & CLEAN DATA                                       #   
#########################################################################

  ######## ELECTION RESULTS (2011, 2014, 2018, 2019)
  
  # Load merged municipal-list file
  mun <- read.xlsx("mun_new_list_all.xlsx") 
  nrow(mun) # 65631
  
  ######## PARTY INFO
  
  # Load data
  par <- read.xlsx("parties_rep.xlsx")
  
  # Select relevant variables from party info
  par_sel <- par %>% dplyr::select(list_short, rcd, islamist, 
                            group_fp, group_destourian, group_cpr, 
                            mun_2018, total_seats_2011_2019)
  
  # Merge election and party info
  nrow(mun) # 65631
  mun_party <- full_join(mun, par_sel, by = "list_short") %>%
    filter(!is.na(year))
  nrow(mun_party) # 65631
  
  ######## POPULATION BY MUNICIPALITY
  
  # Load municipality population data (aggregated from sectors)
  secpop <- read.xlsx("population_mun_2014.xlsx") 
  
  # Merge detailed population breakdown into municipal data
  mun_party <- full_join(mun_party, secpop, by = "mun_uid")
  nrow(mun_party) # 65631
  
  ######## SEATS
  
  # Select relevant variables
  seats <- read.xlsx("mun_id.xlsx") %>% 
    dplyr::select(u_id, seats) 
  
  # Merge in number of municipal seats
  mun_party <- mun_party %>% full_join(seats, by = c("mun_uid" = "u_id"))
  
#########################################################################   
#               LIST-LEVEL VARIABLES : DISTRICT & NATIONAL METRICS      #   
#########################################################################     
  
  # NOTE: Have to calculate these vars before averaging municipal results
  # across districts for the two muns that span districts (Tunis and Sfax)
  
  elec_lists <- mun_party
  nrow(elec_lists) # 65631
  
  ######## NUMBER OF DISTRICTS IN WHICH LISTS RAN [# - 2011,2014,2019]
  
  # Count number of districts in which non-indie lists ran
  dis_per_list <- elec_lists %>%
    filter(list_type != "independent" & year != "2018") %>%
    dplyr::select(year, dis_en, list_short) %>% 
    distinct() %>%
    group_by(year, list_short) %>%
    count() %>%
    rename(n_dis = n) %>%
    filter(!is.na(list_short)) %>%
    ungroup()
  
  # Merge and replace with "1" for independents
  elec_lists <- full_join(elec_lists, dis_per_list, by = c("year", "list_short"))  %>%
    mutate(n_dis = ifelse(list_type == "independent", 1, n_dis)) %>%
    mutate(n_dis = ifelse(year == "2014_prez1" | year == "2014_prez2", 27, n_dis))
  nrow(elec_lists) # 65631
  
  ######## NUMBER OF MUNICIPALITIES IN WHICH A LIST RAN [# - 2018]
  
  # Count number of municipalities in which each non-indie list ran
  mun_per_list <- elec_lists %>%
    filter(list_type != "independent" & year == "2018") %>%
    group_by(year, list_short) %>%
    count() %>%
    rename(n_mun = n) 
  
  # Merge and replace with "1" for independents
  elec_lists <- full_join(elec_lists, mun_per_list, by = c("year", "list_short"))  %>%
    mutate(n_mun = ifelse(list_type == "independent" & year == "2018", 1, n_mun)) 
  nrow(elec_lists) # 65631
  
  ######## LISTS PER DISTRICT 
  
  # Overall number of lists [# - ALL]
  elec_lists <- elec_lists %>% group_by(year, dis_en, mun_uid) %>% 
    mutate(n_lists = n_distinct(list_name_ar)) %>% as.data.frame()
  
  # Seats-to-list ratio [% - 2018]
  elec_lists <- elec_lists %>%
    mutate(seats_lists = ifelse(year == "2018", seats/n_lists, NA))
  
  # Count list class
  n_by_class <- elec_lists %>% 
    group_by(year, dis_en, mun_uid, list_short) %>%
    summarise(n = n()) %>%
    spread(list_short, n) %>% 
    rename(n_lists_indie = "<NA>",
           n_lists_en = "Ennahdha",
           n_lists_ni = "Nidaa") %>%
    mutate(n_lists_indie = ifelse(is.na(n_lists_indie), 0, n_lists_indie)) %>%
    ungroup()
  
  # Count third party
  n_lists_third <- n_by_class %>%
    dplyr::select(AA:ZYFP, -c(n_lists_ni, n_lists_en)) %>%
    rowSums(na.rm=TRUE)
  n_by_class <- cbind(n_by_class, n_lists_third) %>%
    mutate(n_lists_third = ifelse(is.na(n_lists_third), 0, n_lists_third)) %>%
    dplyr::select(year, dis_en, mun_uid, n_lists_en, n_lists_ni, n_lists_third, n_lists_indie)
    
  # Merge back into results
  elec_lists <- elec_lists %>% full_join(n_by_class, by = c("year", "dis_en", "mun_uid")) 
  nrow(elec_lists) # 65631
  
  # Proportion of different types of lists
  elec_lists <- elec_lists %>% 
    # Prop. independent lists
    mutate(per_lists_indie = n_lists_indie/n_lists,
           # Prop third party lists
           per_lists_third = n_lists_third/n_lists,
           # Ratio of independent to third party
           lists_ind_third = n_lists_indie/n_lists_third)
  
#########################################################################   
#               LIST-LEVEL VARIABLES : VOTESHARE                        #   
#########################################################################   
  
  ######## LIST MUNICIPAL VOTE SHARE [% - ALL]
  
  # Votes cast per list as a percent of valid votes [2011, 2014 parl] or 
  # a as a percent of list ballots when we don't have valid [2014 prez,
  # 2018, 2019]
  
  elec_lists <- elec_lists %>% 
    mutate(votes_per = votes/valid_ballots) %>%
    mutate(votes_per = ifelse(is.na(votes_per), votes/list_ballots, votes_per))
  
  ######## LIST MUNICIPAL VOTE SHARE CENTERED (MINUS AVERAGE IN NAT/DIS) [% - ALL]
  
  # Calculate average vote share across municipalities in district
  dis_mean <- elec_lists %>%
    filter(list_type != "independent") %>%
    group_by(year, dis_en, list_short) %>% 
    dplyr::summarize(dis_votes_per_mean = mean(votes_per, na.rm = T))
  
  # Calculate average vote share across all municipalities
  nat_mean <- elec_lists %>%
    filter(list_type != "independent") %>%
    group_by(year, list_short) %>% 
    dplyr::summarize(nat_votes_per_mean = mean(votes_per, na.rm = T))
  
  # Merge into full data
  elec_lists <- elec_lists %>%
    full_join(dis_mean, by = c("year", "dis_en", "list_short")) %>%
    full_join(nat_mean, by = c("year", "list_short")) 
  nrow(elec_lists) # 65631
  
  # Take difference between municipal vote share and average votes share within district/nation
  elec_lists <- elec_lists %>%
    mutate(votes_per_dis_mean = votes_per - dis_votes_per_mean,
           votes_per_nat_mean = votes_per - nat_votes_per_mean) %>%
    dplyr::select(-c(dis_votes_per_mean, nat_votes_per_mean))
  
  ######## LIST MUNICIPAL VOTESHARE vs. DISTRICT VOTESHARE [% - 2011,2014,2019]
  
  # Calculate each lists' overall voteshare in the district 
  dis_vote <- elec_lists %>%
    filter(year != "2018") %>%
    group_by(year, dis_en, list_name_ar) %>%
    dplyr::summarize(dis_votes = sum(votes),
              list_ballots = sum(list_ballots),
              cancelled_ballots = sum(cancelled_ballots), 
              blank_ballots = sum(blank_ballots),
              ) %>%
    mutate(dis_votes_per = dis_votes/(list_ballots + cancelled_ballots + blank_ballots)) %>%
    mutate(dis_votes_per = ifelse(is.na(dis_votes_per), dis_votes/list_ballots, dis_votes_per)) %>%
    dplyr::select(-c(list_ballots, cancelled_ballots, blank_ballots))
  
  # Merge 
  elec_lists <- full_join(elec_lists, dis_vote, by = c("year", "dis_en", "list_name_ar"))
  nrow(elec_lists) # 65631
  
  # Calculate difference between municipal and district voteshare
  elec_lists <- mutate(elec_lists, dis_vote_diff = votes_per - dis_votes_per) %>%
    dplyr::select(-dis_votes_per)
  
  # Calculate the percent of district-wide votes each list won in the municipality
  elec_lists <- mutate(elec_lists, dis_vote_per = votes/dis_votes) %>%
    dplyr::select(-dis_votes)
  
  ######## LIST MUNICIPAL VOTESHARE vs. NATIONAL VOTESHARE [% - ALL]
  
  # Calculate each lists' overall voteshare in across the nation (non-indie)
  nat_vote <- elec_lists %>%
    filter(list_type != "independent") %>%
    group_by(year, list_short) %>%
    dplyr::summarize(nat_votes = sum(votes),
              list_ballots = sum(list_ballots),
              cancelled_ballots = sum(cancelled_ballots), 
              blank_ballots = sum(blank_ballots)) %>%
    mutate(nat_votes_per = nat_votes/(list_ballots + cancelled_ballots + blank_ballots)) %>%
    mutate(nat_votes_per = ifelse(is.na(nat_votes_per), nat_votes/list_ballots, nat_votes_per)) %>%
    dplyr::select(-c(list_ballots, cancelled_ballots, blank_ballots)) 
  
  # Merge 
  elec_lists <- full_join(elec_lists, nat_vote, by = c("year", "list_short"))
  nrow(elec_lists) # 65631
  
  # Calculate difference between municipal and district voteshare
  elec_lists <- mutate(elec_lists, nat_vote_diff = votes_per - nat_votes_per) %>%
    dplyr::select(-nat_votes_per)
  
  # Calculate percent of nation-wide voteshare from municipality
  elec_lists <- mutate(elec_lists, nat_vote_per = votes/nat_votes) %>%
    dplyr::select(-nat_votes)
  
  ######## LIST ABOVE NATIONAL MEAN VOTESHARE [0/1 - ALL]
  
  # Calculate whether list above MEAN national voteshare
  elec_lists <- mutate(elec_lists, above_mean = ifelse(votes_per_nat_mean >=0, 1, 0)) 
  
  # Calculate whether list above MEDIAN national voteshare
  med <- elec_lists %>%
    filter(list_type != "independent") %>%
    group_by(year, list_short) %>% 
    dplyr::summarize(nat_votes_per_med = median(votes_per, na.rm = T))
  
  # Merge into full data
  elec_lists <- elec_lists %>%
    full_join(med, by = c("year", "list_short")) %>%
    mutate(above_median = ifelse(votes_per >= nat_votes_per_med, 1, 0)) %>%
    dplyr::select(-nat_votes_per_med)
  
  nrow(elec_lists) # 65631
  
#########################################################################   
#               LIST-LEVEL VARIABLES : COMPETITION                      #   
#########################################################################    
  
  ######## OVERALL MARGIN OF DIFFERENCE [%]
  
  # Find first and second place & calculate margin of difference
  ranks <- elec_lists %>%
    group_by(year, dis_en, mun_uid) %>%
    top_n(2, votes_per) %>% # select top 2
    mutate(first = max(votes_per),
           second = min(votes_per),
           margin_diff = first - second) %>%
    dplyr::select(year, dis_en, mun_uid, margin_diff, first, second) %>%
    distinct()
  
  # Merge 
  elec_lists <- full_join(elec_lists, ranks, by = c("year", "dis_en", "mun_uid"))
  nrow(elec_lists) # 65631
  
  ######## MARGIN OF DIFFERENCE FOR EACH LIST [%]
  
  # Calculate difference between list voteshare and second place list
  elec_lists <- elec_lists %>% 
    mutate(list_margin = votes_per - first) %>%
    mutate(list_margin = ifelse(list_margin == 0, votes_per - second, list_margin)) %>%
    mutate(list_margin2 = votes_per - second) %>%
    mutate(list_margin3 = votes_per - first) %>%
    mutate(list_margin4 = 1 - abs(votes_per - first)) %>%
    dplyr::select(-c(first, second)) 
  nrow(elec_lists) # 65631

  ######## EFFECTIVE NUMBER OF LISTS [#]  
  
  # Herfindahl index for number of competitive lists in mun
  enl <- elec_lists %>%
    group_by(year, dis_en, mun_uid) %>%
    summarise(enl = 1/sum(votes_per^2, na.rm = T))   
  
  # Merge 
  elec_lists <- full_join(elec_lists, enl, by = c("year", "dis_en", "mun_uid"))  
  nrow(elec_lists) # 65631  
  
#########################################################################   
#               LIST-LEVEL VARIABLES : MUN SEATS WONS                   #   
#########################################################################     
  
  # Calculate denominator (# of votes in mun; quota should be valid votes, 
  # but this doesn't exist for all years)
  
  munvotes <- elec_lists %>%
    dplyr::select(year, dis_en, mun_uid, list_ballots) %>%
    distinct() %>%
    group_by(year, dis_en, mun_uid) %>%
    summarise(mun_votes = sum(list_ballots))
  
  # Calculate round 1 seats won per list
  elec_lists <- elec_lists %>% 
    # Merge in number of seats
    full_join(munvotes, by = c("year", "dis_en", "mun_uid")) %>%
    # Per-municipality quota 
    mutate(mun_quota = mun_votes/seats,
           # Number of quotas won by the list
           list_quota = votes/mun_quota,
           # Round 1 seats won by the party (round down to whole number)
           list_seats_r1 = floor(list_quota),
           # Remainder of list's quota value (minus number of seats it already won)
           list_remainder = list_quota - list_seats_r1) %>%
    group_by(year, dis_en, mun_uid) %>%
    # List rank by size of remainder
    mutate(remainder_rank = rank(-list_remainder)) %>%
    data.frame()
  
  # Calculate total seats assigned in round 1 by district
  mun_r1 <- elec_lists %>%
    group_by(year, dis_en, mun_uid) %>%
    summarise(r1_seats = sum(list_seats_r1)) %>%
    data.frame()
  
  # Calculate round 2 and total seats by list
  elec_lists <- elec_lists %>% full_join(mun_r1, by = c("year", "dis_en", "mun_uid")) %>%
    # Number of seats remaining in municipality
    mutate(r2_seats = seats - r1_seats,
           # Assign remaining seats to the top-ranked lists 
           list_seats_r2  = ifelse(remainder_rank <= r2_seats, 1, 0),
           # Total number of seats won by list in district
           list_seats_n = list_seats_r1 + list_seats_r2) 
  
  # Proportion of seats won
  elec_lists <- mutate(elec_lists, list_seats_per = list_seats_n/seats)
  nrow(elec_lists) # 65631
  
#########################################################################   
#               MUN-LEVEL VARIABLES : SELECT AND COLLAPSE BY DIS        #   
#########################################################################   
  
  ####### SELECT MUNICIPAL-LEVEL VARIABLES
  
  mun_sel <- elec_lists %>% 
    dplyr::select(year, gov_en, dis_en, mun_uid, mun_en_2018, 
           contains("_ballots"), contains("voters"), contains("lists"),
           contains("reg_"), contains("turn_"), contains("pop"),
           margin_diff, enl) %>%
    distinct() 
  nrow(mun_sel) # 2103
  
  ######## SUM/AVERAGE MUNICIPAL-LEVEL VARIABLES FOR TUNIS AND SFAX
  
  # NOTE: Lose district-level variables after this point
  
  # Sum count variables
  mun_sums <- mun_sel %>% 
    group_by(year, gov_en, mun_uid, mun_en_2018, tot_pop, tot_pop_f, tot_pop_m) %>%
    summarize_at(vars(contains("_ballots"), contains("voters"), contains("reg_")), sum)
  nrow(mun_sums) # 2093
  
  # Take mean of percent variables + number of lists and competition measures 
  mun_mean <- mun_sel %>% 
    group_by(year, mun_uid) %>%
    summarize_at(vars(contains("turn_"), contains("lists"), enl, margin_diff), mean) 
  nrow(mun_mean) # 2093
  
  # Merge together
  muns <- full_join(mun_sums, mun_mean, by = c("year", "mun_uid")) %>%
    ungroup()
  nrow(muns) # 2093
  
#########################################################################   
#               MUN-LEVEL VARIABLES : RESULTS                           #   
######################################################################### 
  
  mun_per <- muns
  nrow(mun_per) # 2093 [missing 7 municipalities in Gafsa for 2014, should be 2100]
  
  ######## LIST BALLOTS [% - ALL]  
  
  # List ballots as percent of total votes in municipality
  mun_per <- mun_per %>% mutate(list_per = list_ballots/total_voters)  
  
  ######## BLANK BALLOTS CAST [% - ALL]
  
  # Blank votes as percent of valid votes in municipality
  # Could change to blank + canceled ("protest")
  mun_per <- mutate(mun_per, blank_per = blank_ballots/valid_ballots) 
  
  ######## TURNOUT BY REGISTERED [% - ALL]  
  
  # Signing voters as percent of registered voters in municipality [2011, 2014 parl]
  # for years without total voters (or valid), using list voter [2014 prez, 2018, 2019]
  mun_per <- mun_per %>% 
    mutate(turnout_per = total_voters/registered_voters) %>%
    mutate(turnout_per = ifelse(is.na(turnout_per), list_ballots/registered_voters, turnout_per)) %>%
    mutate(turnout_per_list = list_ballots/registered_voters)
  
  # Check difference
  library(ggplot2)
  ggplot(mun_per) + 
    #ggplot(filter(mun_per, year == "2014_parl")) + 
    geom_histogram(aes(x=turnout_per), fill = "magenta", alpha = .3, bins = 30) + 
    geom_histogram(aes(x=turnout_per_list), alpha = .3, fill = "blue", bins = 30) 
  
  ######## TURNOUT BY POPULATION [% - ALL]  
  
  # Signing voters as percent of total population [2014 census]
  mun_per <- mun_per %>% 
    mutate(turnout_per_pop = total_voters/tot_pop) %>%
    mutate(turnout_per_pop = ifelse(is.na(total_voters), list_ballots/tot_pop, turnout_per_pop)) %>%
    mutate(turnout_per_pop_list = list_ballots/tot_pop)
  
  # Check difference
  ggplot(mun_per) + 
  #ggplot(filter(mun_per, year == "2014_parl")) + 
    geom_histogram(aes(x=turnout_per_pop), fill = "magenta", alpha = .3, bins = 30) + 
    geom_histogram(aes(x=turnout_per_pop_list), alpha = .3, fill = "blue", bins = 30) 
  
  ######## TURNOUT BY GENDER [% - 2011, 2014 parl]  
  
  # Turnout by gender (as a percent of underlying population)
  # Note: don't have total voters by gender, have to back out using percents, 
  # Some of population will be under voting age
  mun_per <- mun_per %>% 
    mutate(turnout_per_pop_f = total_voters*turn_women/tot_pop_f,
           turnout_per_pop_m = total_voters*turn_men/tot_pop_m,
           turnout_per_pop_gap = turnout_per_pop_m - turnout_per_pop_f) 
  
  ######## REGISTRATION BY DEMOGRAPHIC GROUPS [% - 2018 and 2019]  
  
  # Percent of registered voters who are female, young
  mun_per <- mun_per %>%
    mutate(reg_women_per = reg_women/registered_voters) %>%
    mutate(reg_18to25_per = (reg_18to25_m + reg_18to25_f)/registered_voters) %>%
    mutate(reg_26to45_per = (reg_26to45_m + reg_26to45_f)/registered_voters) %>%
    mutate(reg_18to35_per = reg_18to35/registered_voters) %>%
    mutate(reg_36to45_per = reg_36to45/registered_voters) %>%
    mutate(reg_young_women_per = (reg_18to25_f + reg_26to45_f)/registered_voters) %>%
    mutate(reg_18to45_per = 
             (reg_18to25_m + reg_18to25_f + reg_26to45_m + reg_26to45_f)/registered_voters) %>%
    mutate(reg_18to45_per = 
             ifelse(is.na(reg_18to45_per), (reg_18to35 + reg_36to45)/registered_voters, reg_18to45_per)) %>%
    data.frame()
  nrow(mun_per) # 2093
  
#########################################################################   
#               MUN-LEVEL VARIABLES : CALC TYPES OF LISTS               #   
#########################################################################    
  
  # NOTE: THIS SECTION CALCULATES VOTE PERCENTS FOR SPECIFIC LIST TYPES 
  # TO AGGREGATE AT THE MUNICIPAL LEVEL
  
  elec_type <- elec_lists
  
  ######## INDEPENDENTS
  
  # Calculate voteshare for independents
  indie <- elec_type  %>%
    filter(list_type == "independent") %>%
    group_by(year, dis_en, mun_uid) %>%
    summarise(indie_votes_per = sum(votes_per, na.rm = T))
  
  ######## LOCAL LISTS 
  
  # //// NOTE: MAY NEED TO CHANGE MEASURE FOR DIFF YEARS 
  summary(elec_type$n_dis[elec_type$year == "2011"]) # median = 9, mean = 11.56
  summary(elec_type$n_dis[elec_type$year == "2014_parl"]) # median = 12, mean = 13.35
  summary(elec_type$n_dis[elec_type$year == "2019_parl"]) # median = 16, mean = 14.17
  
  # Code lists as being "local" if they ran in X or fewer districts
  local_cutoff <- 3
  elec_type <- elec_type %>% 
    mutate(local_list = ifelse(year != "2018" & n_dis <= local_cutoff, 1,
                               ifelse(year == "2018" & n_mun <= local_cutoff, 1, 0))) %>%
    dplyr::select(-c(n_dis, n_mun))
  
  # Calculate voteshare for local lists (indie + those running in <=3 districts)
  local <- elec_type  %>%
    filter(local_list == 1) %>%
    group_by(year, dis_en, mun_uid) %>%
    summarise(local_votes_per = sum(votes_per))
  
  ######## THIRD PARTIES
  
  # Party lists other than Ennahdha and Nidaa (not including indie)
  # Coding as NA for 2011 (Nidaa didn't exist) and 2014 prez (no Ennahdha candidate)
  elec_type <- elec_type %>% mutate(
    third_party = ifelse(list_short == "Ennahdha" | list_short == "Nidaa", 0, 1),
    third_party = ifelse(list_type == "independent", 0, third_party),
    third_party = ifelse(year == "2011" | year == "2014_prez1" | year == "2014_prez2",
                         NA, third_party))  
  table(elec_type$third_party, elec_type$year)
  
  # Calculate voteshare for third parties (non-Ennahdha and Nidaa)
  third <- elec_type  %>%
    filter(third_party == 1) %>%
    group_by(year, dis_en, mun_uid) %>%
    summarise(third_votes_per = sum(votes_per))
  
  ######## ISLAMIST LISTS
  
  # Change Ennahdha to NA value for Islamist (want non-Ennahdha Islamist votes)
  elec_type <- elec_type %>%
    mutate(islamist = ifelse(list_short == "Ennahdha", NA, islamist))
  
  # Calculate voteshare for islamist parties/coalitions
  islamist <- elec_type  %>%
    filter(islamist == "Y") %>%
    group_by(year, dis_en, mun_uid) %>%
    summarise(islamist_votes_per = sum(votes_per))
  
  ######## RCD-AFILLIATED LISTS
  
  # Calculate voteshare for islamist parties/coalitions
  rcd <- elec_type  %>%
    filter(rcd == "Y") %>%
    group_by(year, dis_en, mun_uid) %>%
    summarise(rcd_votes_per = sum(votes_per))
  
  ######## PARTY GROUPS
  
  # Select different party groups
  groups <- elec_type %>%
    filter(group_fp == "Y" | group_destourian == "Y" | group_cpr == "Y") %>%
    mutate(list_short = case_when(group_fp == "Y" ~ "FP_grp",
                                  group_destourian == "Y" ~ "Destouri_grp",
                                  group_cpr == "Y" ~ "CPR_grp")) 
  
  # Take means of national/dis share and list margin variables  
  groups_mean <- groups %>%
    group_by(year, dis_en, mun_uid, list_short) %>%
    summarize_at(vars(contains("_vote_"), contains("list_margin")), mean)
  
  # Take sum of vote share and join with means
  groups_sum <- groups %>%
    group_by(year, dis_en, mun_uid, list_short) %>%
    summarize_at(vars(contains("votes_per")), sum) %>%
    full_join(groups_mean, by = c("year", "dis_en", "mun_uid", "list_short"))
  
  # Transform to long then wide
  groups_wide <- groups_sum %>%
    gather("variable", "value", votes_per:names(groups_sum[ncol(groups_sum)])) %>%
    group_by(year, dis_en, mun_uid) %>%
    unite(key, list_short, variable) %>%
    spread(key, value)
  dim(groups_wide) # 1938 x 36
  
  ######## LISTS THAT RAN IN 2018 OR WON PARL SEATS
  
  # Join collapsed groups to other important lists
  imp <- elec_type %>%  
    filter(mun_2018 == "Y" | total_seats_2011_2019 > 10) %>%
    dplyr::select(year, dis_en, mun_uid, list_short, contains("votes_per"), contains("_vote_"), contains("list_margin")) 
  nrow(imp) # 18611
  
  # Transform to long then wide
  imp_wide <- imp %>%
    gather("variable", "value", votes_per:names(imp[ncol(imp)])) %>%
    group_by(year, dis_en, mun_uid) %>%
    unite(key, list_short, variable) %>%
    spread(key, value)
  dim(imp_wide) # 2103 x 366
  
  ######## MERGE TOGETHER
  
  types <- indie %>%
    full_join(local, by = c("year", "dis_en", "mun_uid")) %>%
    full_join(third, by = c("year", "dis_en", "mun_uid")) %>%
    full_join(islamist, by = c("year", "dis_en", "mun_uid")) %>%
    full_join(rcd, by = c("year", "dis_en", "mun_uid"))   %>%
    full_join(groups_wide, by = c("year", "dis_en", "mun_uid")) %>%  
    full_join(imp_wide, by = c("year", "dis_en", "mun_uid"))  %>%
    as.data.frame()
  dim(types) # 2103 x 404
  
  ######## AVERAGE LIST TYPES ALL ACROSS MUNICIPALITY
  
  # Take means across municipalities for Tunis and Sfax
  types_mean <- types %>%
    group_by(year, mun_uid) %>%
    summarize_at(vars(contains("vote")), mean) 
  nrow(types_mean) # 2093
  
  ######## MERGE IN WITH MUNICIPAL LEVEL DATA
  
  mun_type <- full_join(mun_per, types_mean, by = c("year", "mun_uid"))
  nrow(mun_type) # 2093
  
#########################################################################   
#               TRANSFORM MUNICIPAL VARIABLES TO LONG AND WIDE          #   
#########################################################################    
  
  ######## TRANSFORM TO LONG AND WIDE X YEAR
  
  # Transform to long 
  mun_long <- mun_type %>%
    dplyr::select(-c(tot_pop_f, tot_pop_m)) %>%
    gather("variable", "value", valid_ballots:names(mun_type[ncol(mun_type)])) %>%
    filter(!is.na(value))
  nrow(mun_long) # 220361
  
  # Transform to wide
  mun_wide <- mun_long %>%
    unite(key, variable, year) %>%
    group_by(gov_en, mun_uid, mun_en_2018, tot_pop) %>%
    spread(key, value) %>%
    dplyr::select_if(all_na) %>% 
    ungroup() %>%
    as.data.frame()
  dim(mun_wide) # 350 x 823
  
#########################################################################   
#               SELECT LIST-LEVEL VARIABLES TO SAVE                     #   
#########################################################################    
  
  lists <- elec_type %>%
    dplyr::select(year, gov_en, dis_en, mun_uid, mun_en_2018, 
           list_type, list_name_ar, list_name_en, list_short, list_order, cand_name_en,
           votes, votes_per, votes_per_dis_mean, votes_per_nat_mean, list_seats_n, list_seats_per,
           dis_vote_diff, dis_vote_per, nat_vote_diff, nat_vote_per, contains("list_margin"), contains("list_seats"),
           contains("above"), local_list, third_party, islamist, rcd, group_fp, group_destourian, group_cpr)
  nrow(lists) # 65631  
  
#########################################################################   
#               SAVE                                                    #   
#########################################################################  
  
  ######## WRITE FILES TO FOLDER
  
  write.xlsx(mun_long, "~/Desktop/replication_what_men_want/data/elections_mun_new_variables_long.xlsx")
  write.xlsx(mun_wide, "~/Desktop/replication_what_men_want/data/elections_mun_new_variables_wide.xlsx")
  write.xlsx(lists, "~/Desktop/replication_what_men_want/data/elections_mun_list_variables.xlsx")
  
  
  